Load raw data (from https://www.kaggle.com/ludobenistant/hr-analytics), add factors, save.
HR <- read_csv("HR_comma_sep.csv", col_types=cols(
satisfaction_level = col_double(),
last_evaluation = col_double(),
number_project = col_integer(),
average_montly_hours = col_integer(),
time_spend_company = col_integer(),
Work_accident = col_integer(),
left = col_integer(),
promotion_last_5years = col_integer(),
sales = col_character(),
salary = col_character()
))
HR <- plyr::rename(HR, replace=c("satisfaction_level"="Satis", "last_evaluation"="Eval",
"number_project"="NumProj", "average_montly_hours"="MonHrs",
"time_spend_company"="Tenure", "Work_accident"="Accdt01",
"left"="Left01", "promotion_last_5years"="Promo5yr01",
"sales"="Dept", "salary"="Salary"))
HR$Salary <- factor(HR$Salary, levels=c("low", "medium", "high"), ordered=TRUE)
HR$Dept <- factor(HR$Dept)
HR$Accdt <- factor(HR$Accdt01 == 1)
HR$Promo5yr <- factor(HR$Promo5yr01 == 1)
HR$Left <- factor(HR$Left01==1)
set.seed(42)
HR$location <- factor(sapply(HR$Dept, location))
write_csv(HR, "HR.csv")
summary(HR)
## Satis Eval NumProj MonHrs
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
##
## Tenure Accdt01 Left01 Promo5yr01
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.498 Mean :0.1446 Mean :0.2381 Mean :0.02127
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Dept Salary Accdt Promo5yr
## sales :4140 low :7316 FALSE:12830 FALSE:14680
## technical :2720 medium:6446 TRUE : 2169 TRUE : 319
## support :2229 high :1237
## IT :1227
## product_mng: 902
## marketing : 858
## (Other) :2923
## Left location
## FALSE:11428 Brisbane :1801
## TRUE : 3571 London :8925
## San Francisco:4273
##
##
##
##
plot.new()
png(filename="hr-scatter.png", res=300, width = 3000, height = 3000)
my_colors <- brewer.pal(3, "Set2")
samp.sz <- 3750
pct <- round(100*samp.sz/nrow(HR))
data <- HR[sample(nrow(HR), samp.sz),]
scatterplotMatrix(~Satis+Eval+NumProj+MonHrs+Tenure+Accdt01+Promo5yr01+Dept+Salary|Left01, data=data, reg.line="", smoother="", col=my_colors , smoother.args=list(col="grey") , pch=c(3,4), legend.plot=FALSE, main=paste("Scatter Plot Pairs for HR Turnover (sampling ",pct,"% of data)", sep=""))
par(xpd=TRUE, cex=0.7)
Ly <- 1.16 # for inline HTML
Ly <- 1.05 # for PNG
legend(x=0.91, y=Ly, c("Stay", "Leave"), col=my_colors, pch=c(3,4), horiz=TRUE)
dev.off()
Correlations between all pairs of numeric dimensions are investigated over the full dataset, and within the Leaving and Staying subsets.
P-values (and optionally confidence intervals) for all pairs are calculated with the following formula. Correlations not passing the significance test (p<0.01) will be marked with an ‘X’ in the matrix.
cor.mtest <- function(mat, conf.level = 0.95){
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
diag(p.mat) <- 0
diag(lowCI.mat) <- diag(uppCI.mat) <- 1
for(i in 1:(n-1)){
for(j in (i+1):n){
tmp <- cor.test(mat[,i], mat[,j], conf.level = conf.level)
p.mat[i,j] <- p.mat[j,i] <- tmp$p.value
lowCI.mat[i,j] <- lowCI.mat[j,i] <- tmp$conf.int[1]
uppCI.mat[i,j] <- uppCI.mat[j,i] <- tmp$conf.int[2]
}
}
return(list(p.mat, lowCI.mat, uppCI.mat))
}
Leaving (Left01) is negatively correlated with:
Leaving is positively correlated with:
Other significant correlations include:
DF <- HR[1:8]
M <- cor(DF)
pval99 <- cor.mtest(DF,0.99)
corrplot(M, p.mat = pval99[[1]], sig.level=0.01, method = 'pie', order ="hclust", addrect=2,
tl.col="black", tl.cex = 1, tl.offset = 0.1, tl.srt = 45)
DF <- HR[HR$Left==FALSE,c(1:6,8)]
M <- cor(DF)
pval99 <- cor.mtest(DF,0.99)
corrplot(M, p.mat = pval99[[1]], sig.level=0.01, method = 'pie', order ="hclust", addrect=2,
tl.col="black", tl.cex = 1, tl.offset = 0.1, tl.srt = 45)
For staff who leave: * Satisfaction is negatively correlated with the number of projects (-0.23) and the monthly hours worked (-0.08) * Satisfaction is positively correlated with tenure (0.44) and evaluation (0.18) * Evaluations are positively correlated with monthly hours worked (0.83), the number of projects worked (0.80), tenure (0.78) and job satisfaction (0.18) * Tenure is positively correlated with evaluation (0.78), monthly hours worked (0.66), number of projects worked (0.60) and job satisfaction (0.44)
DF <- HR[HR$Left==TRUE,c(1:6,8)]
M <- cor(DF)
pval99 <- cor.mtest(DF,0.99)
corrplot(M, p.mat = pval99[[1]], sig.level=0.01, method = 'pie', order ="hclust", addrect=2,
tl.col="black", tl.cex = 1, tl.offset = 0.1, tl.srt = 45)
Many people leaving have been with the company at least 4 years but have not had a promotion in the last 5 years, despite working long hours.
with(HR, coplot(MonHrs ~ jitter(Tenure) |
Promo5yr + Left))
These unpromoted, long-term workers who leave are receiving the highest evaluations.
with(HR, coplot(Eval ~ jitter(Tenure) |
Promo5yr + Left))
These unpromoted, long-term workers who leave are also reporting high levels of satisfaction with the company.
with(HR, coplot(Satis ~ jitter(Tenure) |
Promo5yr + Left))
Of the unpromoted workers putting in long hours who are leaving, some report the lowest levels of satisfaction, but more report very high levels of satisfaction.
The ones who are working the fewest hours report fairly low satisfaction.
with(HR, coplot(MonHrs ~ Satis |
Promo5yr + Left))
The staff on the largest number of projects and who put in the longest hours, with no promotion in 5 years, tend to leave.
with(HR, coplot(MonHrs ~ jitter(NumProj) |
Promo5yr + Left))
Of the unpromoted workers who left, many had both high evaluations and high levels of satisfaction.
with(HR, coplot(Eval ~ Satis |
Promo5yr + Left))
Plot: evaluation, satisfaction | hours + Left evaluation, satisfaction | time with co + Left evaluation, satisfaction | NumProj + Left